pwnablr.kr 之 memcpy

先看源码：

// compiled with : gcc -o memcpy memcpy.c -m32 -lm
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>
#include <math.h>

unsigned long long rdtsc(){
        asm("rdtsc");
}

char* slow_memcpy(char* dest, const char* src, size_t len){
        int i;
        for (i=0; i<len; i++) {
                dest[i] = src[i];
        }
        return dest;
}

char* fast_memcpy(char* dest, const char* src, size_t len){
        size_t i;
        // 64-byte block fast copy
        if(len >= 64){
                i = len / 64;
                len &= (64-1);
                while(i-- > 0){
                        __asm__ __volatile__ (
                        "movdqa (%0), %%xmm0\n"
                        "movdqa 16(%0), %%xmm1\n"
                        "movdqa 32(%0), %%xmm2\n"
                        "movdqa 48(%0), %%xmm3\n"
                        "movntps %%xmm0, (%1)\n"
                        "movntps %%xmm1, 16(%1)\n"
                        "movntps %%xmm2, 32(%1)\n"
                        "movntps %%xmm3, 48(%1)\n"
                        ::"r"(src),"r"(dest):"memory");
                        dest += 64;
                        src += 64;
                }
        }

        // byte-to-byte slow copy
        if(len) slow_memcpy(dest, src, len);
        return dest;
}

int main(void){

        setvbuf(stdout, 0, _IONBF, 0);
        setvbuf(stdin, 0, _IOLBF, 0);

        printf("Hey, I have a boring assignment for CS class.. :(\n");
        printf("The assignment is simple.\n");

        printf("-----------------------------------------------------\n");
        printf("- What is the best implementation of memcpy?        -\n");
        printf("- 1. implement your own slow/fast version of memcpy -\n");
        printf("- 2. compare them with various size of data         -\n");
        printf("- 3. conclude your experiment and submit report     -\n");
        printf("-----------------------------------------------------\n");

        printf("This time, just help me out with my experiment and get flag\n");
        printf("No fancy hacking, I promise :D\n");

        unsigned long long t1, t2;
        int e;
        char* src;
        char* dest;
        unsigned int low, high;
        unsigned int size;
        // allocate memory
        char* cache1 = mmap(0, 0x4000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        char* cache2 = mmap(0, 0x4000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        src = mmap(0, 0x2000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

        size_t sizes[10];
        int i=0;

        // setup experiment parameters
        for(e=4; e<14; e++){    // 2^13 = 8K
                low = pow(2,e-1);
                high = pow(2,e);
                printf("specify the memcpy amount between %d ~ %d : ", low, high);
                scanf("%d", &size);
                if( size < low || size > high ){
                        printf("don't mess with the experiment.\n");
                        exit(0);
                }
                sizes[i++] = size;
        }

        sleep(1);
        printf("ok, lets run the experiment with your configuration\n");
        sleep(1);

        // run experiment
        for(i=0; i<10; i++){
                size = sizes[i];
                printf("experiment %d : memcpy with buffer size %d\n", i+1, size);
                dest = malloc( size );

                memcpy(cache1, cache2, 0x4000);         // to eliminate cache effect
                t1 = rdtsc();
                slow_memcpy(dest, src, size);           // byte-to-byte memcpy
                t2 = rdtsc();
                printf("ellapsed CPU cycles for slow_memcpy : %llu\n", t2-t1);

                memcpy(cache1, cache2, 0x4000);         // to eliminate cache effect
                t1 = rdtsc();
                fast_memcpy(dest, src, size);           // block-to-block memcpy
                t2 = rdtsc();
                printf("ellapsed CPU cycles for fast_memcpy : %llu\n", t2-t1);
                printf("\n");
        }

        printf("thanks for helping my experiment!\n");
        printf("flag : ----- erased in this source code -----\n");
        return 0;
}

程序比较了两个内存复制算法slow_memcpy和fast_memcpy的效率。用户按程序要求输入十次要分配的内存大小，并使用malloc函数来分配内存，如果所有步骤执行完毕，程序会在最后输出 flag。

fast_memcpy函数中用于内存复制的两个指令movdqa和movntps，他们的操作数如果是内存地址的话，那么这个地址必须是16字节对齐的，否则会产生一般保护性异常导致程序退出。
malloc在分配内存时实际上分多分配4字节用于存储堆块信息，所以如果申请a字节的内存，实际上分配的是a+4字节。另外32位系统上该函数分配的内存是以8字节对齐。

每次分配大小限制在两个相邻的2次幂之间。

知道这些其实就很好筛选了

#coding:utf-8

# from libformatstr import FormatStr
# py64 = FormatStr(isx64=1) 
# py64[printf_got] = onegadget
# sl(py64.payload(start_read_offset))
from pwn import *
import sys

local = 0
# context.terminal=['tmux','splitw','-h']
if len(sys.argv) == 2 and (sys.argv[1] == 'DEBUG' or sys.argv[1] == 'debug'):
    context.log_level = 'debug'

if local:
	p = process('')
    # p = process(argv=['',pay])
    # p = process(["./ld.so","./easygame"],env={"LD_PRELOAD":"./libc.so.6"})
else:
	p = remote("pwnable.kr","9022")

#内存地址随机化
def debug(addr=0,PIE=True):
    if PIE:
        text_base = int(os.popen("pmap {}| awk '{{print $1}}'".format(p.pid)).readlines()[1], 16)
        print "breakpoint_addr --> " + hex(text_base + 0x202040)
        gdb.attach(p,'b *{}'.format(hex(text_base+addr)))
    else:
        gdb.attach(p,"b *{}".format(hex(addr))) 

sd = lambda s    :p.send(s)
rc = lambda s    :p.recv(s)
sl = lambda s    :p.sendline(s)
ru = lambda s    :p.recvuntil(s)
sda = lambda a,s :p.sendafter(a,s)
sla = lambda a,s :p.sendlineafter(a,s)

def leak(name,addr):
	log.info(name + " --> %s",hex(addr))

for i in range(3,13):
    data = "specify the memcpy amount between "+str(pow(2,i))+" ~ "+str(pow(2,i+1))+" : "
    if ((pow(2,i))+8)%16 != 0:
        pay = str(pow(2,i)+8)
    else:
        pay = str(pow(2,i))
    sla(data,pay)
p.interactive()

movntps指令

从XMM寄存器复制4个单精度浮点数至128位内存单元

The source operand is an XMM register, YMM register or ZMM register, which is assumed to contain packed single-precision, floating-pointing. The destination operand is a 128-bit, 256-bit or 512-bit memory location. The memory operand must be aligned on a 16-byte (128-bit version), 32-byte (VEX.256 encoded version) or 64-byte (EVEX.512 encoded version) boundary otherwise a general-protection exception (#GP) will be generated.
对应源操作数为XMM，则目的内存为128比特内存位置。内存必须16字节对齐，否则会生成一个 general-protection 异常。

movdqa指令

移动对齐的双四字128bit

XMM寄存器

SSE指令集提供了xmm寄存器，xmm一组8个128位寄存器，分别名为xmm0~xmm7

对应指令

movaps 把4个对准的单精度值传送到xmm寄存器或者内存
movups 把4个不对准的单精度值传送到xmm寄存器或者内存
movss 把1个单精度值传送到内存或者寄存器的低位双字
movlps 把2个单精度值传送到内存或者寄存器的低四字
movhps 把2个单精度值传送到内存或者寄存器的高四字
movlhps 把2个单精度值从低四字传送到高四字
movhlps 把2个单精度值从高四字传送到低四字

movapd 把2个对准的双精度值传送到xmm寄存器或者内存
movupd 把2个不对准的双精度值传送到xmm寄存器或者内存
movdqa 把2个对准的四字节整数传送到xmm寄存器或者内存
movdqu 把2个不对准的四字节整数传送到xmm寄存器或者内存
movsd 把1个双精度值传送到内存或者寄存器的低四字
movhpd 把1个双精度值传送到内存或者寄存器的高四字
movlpd 把1个双精度值传送到内存或者寄存器的低四字

segfault原因

程序尝试去读或者写一个非法内存地址

A segfault occurs when a reference to a variable falls outside the segment where that variable resides, or when a write is attempted to a location that is in a read-only segment.

https://bbs.pediy.com/thread-253598.htm